The data files used in this project are from the Census Bureau:
Read in appropriate packages:
library(tidyverse)
library(readr)
Read in first data sheet (EDU01a)
library(readr)
sheet1 <- read_csv("https://www4.stat.ncsu.edu/~online/datasets/EDU01a.csv")
sheet1
Select columns (Area_name as area_name, STCOU, any column ending
in “D”)
sheet1 <- sheet1 %>%
select(Area_name, STCOU, ends_with("D")) %>%
rename("area_name" = Area_name)
sheet1
Convert to long format
sheet1 <- sheet1 %>%
pivot_longer(cols=3:12, names_to="edu", values_to = "enrollment")
sheet1
Parse the new column “edu” to pull out the year and convert to
YYYY format Grab the first 7 characters of the new column “edu”
# In the first step we drop the "D" from the "edu" column and then split the resulting column into two new columns - "edu" overwritten to now contain the first three characters and four digits of the old "edu" column, and "yr" which contains the last two digits of the year for the enrollment value
sheet1 <- sheet1 %>%
mutate(edu=substr(sheet1$edu, 1, 9)) %>%
separate(edu, into=c("edu_code", "yr"), sep=7, remove = TRUE, convert = FALSE)
# In the second step, we evaluate if the new column "yr" is greater than 50. If so, we paste0 "19" to it's value and force the value to be read as numeric; if the column value is less than 50, we paste "20" to the value and force the value to be read as numeric.
sheet1 <- sheet1 %>%
mutate(yr = if_else(sheet1$yr > 50, as.numeric(paste0("19", sheet1$yr, sep="")), as.numeric(paste0("20", sheet1$yr, sep=""))))
sheet1
Create two data sets:
county containing only the county-level rowsstate containing the non-county level rowsAdd appropriate class to each
county_df <- sheet1[grep(pattern = ", \\w\\w", sheet1$area_name), ]
state_df <- sheet1[-c(grep(patter = ", \\w\\w", sheet1$area_name)), ]
class(county_df) <- c("county", class(county_df))
class(state_df) <- c("state", class(state_df))
county_df
state_df
Add column containing state information for the county-level
tibble
county_df <- county_df %>%
mutate("state" = substr(county_df$area_name, nchar(county_df$area_name)-1, nchar(county_df$area_name)))
county_df
Add column to non-county tibble corresponding to the state’s
classification of division
# created a vector for each regional division containing the names of the states within the division
new_england <- c("CONNECTICUT", "MAINE", "MASSACHUSETTS", "NEW HAMPSHIRE", "RHODE ISLAND", "VERMONT")
mid_atlantic <- c("NEW JERSEY", "NEW YORK", "PENNSYLVANIA")
east_north_central <- c("ILLINOIS", "INDIANA", "MICHIGAN", "OHIO", "WISCONSIN")
west_north_central <- c("IOWA", "KANSAS", "MINNESOTA", "MISSOURI", "NEBRASKA", "NORTH DAKOTA", "SOUTH DAKOTA")
# included District of Columbia in the South Atlantic Division since both Maryland and Virginia are in this division
south_atlantic <- c("DELAWARE", "FLORIDA", "GEORGIA", "MARYLAND", "NORTH CAROLINA", "SOUTH CAROLINA", "VIRGINIA", "DISTRICT OF COLUMBIA", "WEST VIRGINIA", "District of Columbia")
east_south_central <- c("ALABAMA", "KENTUCKY", "MISSISSIPPI", "TENNESSEE")
west_south_central <- c("ARKANSAS", "LOUISIANA", "OKLAHOMA", "TEXAS")
mountain <- c("ARIZONA", "COLORADO", "IDAHO", "MONTANA", "NEVADA", "NEW MEXICO", "UTAH", "WYOMING")
pacific <- c("ALASKA", "CALIFORNIA", "HAWAII", "OREGON", "WASHINGTON")
# created division column using if/else logic
state_df <- state_df %>%
mutate("division" = if_else(area_name %in% new_england, "New England",
if_else(area_name %in% mid_atlantic, "Mid Atlantic",
if_else(area_name %in% east_north_central, "East North Central",
if_else(area_name %in% west_north_central, "West North Central",
if_else(area_name %in% south_atlantic, "South Atlantic",
if_else(area_name %in% east_south_central, "East South Central",
if_else(area_name %in% west_south_central, "West South Central",
if_else(area_name %in% mountain, "Mountain",
if_else(area_name %in% pacific, "Pacific", "ERROR"))))))))))
state_df
Read in second data sheet (EDU01b)
sheet2 <- read_csv("https://www4.stat.ncsu.edu/~online/datasets/EDU01b.csv")
Write a function to select/rename columns and pivot to long
format
# included option argument to name column with enrollment data
select_and_pivot <- function(sheet, var_name = "population") {
df <- sheet %>%
select(Area_name, STCOU, ends_with("D")) %>%
rename("area_name" = Area_name) %>%
pivot_longer(cols=3:12, names_to="edu", values_to = var_name)
return(df)
}
census <- select_and_pivot(sheet2, var_name = "enroll")
census
Write a function that takes the output from
select_and_pivot() and creates a year column and a
census description column (edu_code)
create_edu_yr <- function(sheet) {
dfa <- sheet %>%
mutate(edu=substr(sheet$edu, 1, 9)) %>%
separate(edu, into=c("edu_code", "yr"), sep=7, remove = TRUE, convert = FALSE)
dfb <- dfa %>%
mutate(yr = if_else(dfa$yr > 50, as.numeric(paste0("19", dfa$yr, sep="")), as.numeric(paste0("20", dfa$yr, sep=""))))
return(dfb)
}
census <- create_edu_yr(census)
census
Write a function that will add a state information column
(state) to the county-level tibble
create_state <- function(sheet) {
df <- sheet %>%
mutate("state" = substr(sheet$area_name, nchar(sheet$area_name)-1, nchar(sheet$area_name)))
return(df)
}
Write a function that will add a division classification column (division) to the non-county tibble
create_division <- function(sheet) {
df <- sheet %>%
mutate("division" = if_else(area_name %in% new_england, "New England",
if_else(area_name %in% mid_atlantic, "Mid Atlantic",
if_else(area_name %in% east_north_central, "East North Central",
if_else(area_name %in% west_north_central, "West North Central",
if_else(area_name %in% south_atlantic, "South Atlantic",
if_else(area_name %in% east_south_central, "East South Central",
if_else(area_name %in% west_south_central, "West South Central",
if_else(area_name %in% mountain, "Mountain",
if_else(area_name %in% pacific, "Pacific", "ERROR"))))))))))
return(df)
}
Write a function that takes the output from
create_edu_yr() and creates two tibbes - one for county
rows and another for non-county rows. Add state column to
county tibble and add division classification to non-county
tibble. Return one object with two tibbles
split_county_state <- function(sheet) {
county <- sheet[grep(pattern = ", \\w\\w", sheet$area_name), ]
class(county) <- c("county", class(county))
county <- create_state(county)
state <- sheet[-c(grep(patter = ", \\w\\w", sheet$area_name)), ]
class(state) <- c("state", class(state))
state <- create_division(state)
return(list(county = county, state = state))
}
census <- split_county_state(census)
census$county
census$state
Write a wrapper function that will take in a URL
address to one CSV file and return two tibbles in a single object
process_data <- function(url, var_name = "population") {
a <- read_csv(url)
b <- select_and_pivot(a, var_name)
c <- create_edu_yr(b)
d <- split_county_state(c)
return(d)
}
Call the wrapper function twice, once for each census file (EDU01a & EDU01b)
edu_01a <- process_data("https://www4.stat.ncsu.edu/~online/datasets/EDU01a.csv", var_name = "pop")
edu_01a
## $county
## # A tibble: 31,450 × 6
## area_name STCOU edu_code yr pop state
## <chr> <chr> <chr> <dbl> <dbl> <chr>
## 1 Autauga, AL 01001 EDU0101 1987 6829 AL
## 2 Autauga, AL 01001 EDU0101 1988 6900 AL
## 3 Autauga, AL 01001 EDU0101 1989 6920 AL
## 4 Autauga, AL 01001 EDU0101 1990 6847 AL
## 5 Autauga, AL 01001 EDU0101 1991 7008 AL
## 6 Autauga, AL 01001 EDU0101 1992 7137 AL
## 7 Autauga, AL 01001 EDU0101 1993 7152 AL
## 8 Autauga, AL 01001 EDU0101 1994 7381 AL
## 9 Autauga, AL 01001 EDU0101 1995 7568 AL
## 10 Autauga, AL 01001 EDU0101 1996 7834 AL
## # ℹ 31,440 more rows
##
## $state
## # A tibble: 530 × 6
## area_name STCOU edu_code yr pop division
## <chr> <chr> <chr> <dbl> <dbl> <chr>
## 1 UNITED STATES 00000 EDU0101 1987 40024299 ERROR
## 2 UNITED STATES 00000 EDU0101 1988 39967624 ERROR
## 3 UNITED STATES 00000 EDU0101 1989 40317775 ERROR
## 4 UNITED STATES 00000 EDU0101 1990 40737600 ERROR
## 5 UNITED STATES 00000 EDU0101 1991 41385442 ERROR
## 6 UNITED STATES 00000 EDU0101 1992 42088151 ERROR
## 7 UNITED STATES 00000 EDU0101 1993 42724710 ERROR
## 8 UNITED STATES 00000 EDU0101 1994 43369917 ERROR
## 9 UNITED STATES 00000 EDU0101 1995 43993459 ERROR
## 10 UNITED STATES 00000 EDU0101 1996 44715737 ERROR
## # ℹ 520 more rows
edu_01b <- process_data("https://www4.stat.ncsu.edu/~online/datasets/EDU01b.csv", var_name = "pop")
edu_01b
## $county
## # A tibble: 31,450 × 6
## area_name STCOU edu_code yr pop state
## <chr> <chr> <chr> <dbl> <dbl> <chr>
## 1 Autauga, AL 01001 EDU0101 1997 8099 AL
## 2 Autauga, AL 01001 EDU0101 1998 8211 AL
## 3 Autauga, AL 01001 EDU0101 1999 8489 AL
## 4 Autauga, AL 01001 EDU0102 2000 8912 AL
## 5 Autauga, AL 01001 EDU0102 2001 8626 AL
## 6 Autauga, AL 01001 EDU0102 2002 8762 AL
## 7 Autauga, AL 01001 EDU0152 2003 9105 AL
## 8 Autauga, AL 01001 EDU0152 2004 9200 AL
## 9 Autauga, AL 01001 EDU0152 2005 9559 AL
## 10 Autauga, AL 01001 EDU0152 2006 9652 AL
## # ℹ 31,440 more rows
##
## $state
## # A tibble: 530 × 6
## area_name STCOU edu_code yr pop division
## <chr> <chr> <chr> <dbl> <dbl> <chr>
## 1 UNITED STATES 00000 EDU0101 1997 44534459 ERROR
## 2 UNITED STATES 00000 EDU0101 1998 46245814 ERROR
## 3 UNITED STATES 00000 EDU0101 1999 46368903 ERROR
## 4 UNITED STATES 00000 EDU0102 2000 46818690 ERROR
## 5 UNITED STATES 00000 EDU0102 2001 47127066 ERROR
## 6 UNITED STATES 00000 EDU0102 2002 47606570 ERROR
## 7 UNITED STATES 00000 EDU0152 2003 48506317 ERROR
## 8 UNITED STATES 00000 EDU0152 2004 48693287 ERROR
## 9 UNITED STATES 00000 EDU0152 2005 48978555 ERROR
## 10 UNITED STATES 00000 EDU0152 2006 49140702 ERROR
## # ℹ 520 more rows
Write a function to combine the county and non-county tibbles of
the above outputs
combine_data <- function(data_1, data_2) {
a <- bind_rows(data_1$county, data_2$county)
b <- bind_rows(data_1$state, data_2$state)
return(list(county = a,state = b))
}
Call combine function
edu_01 <- combine_data(edu_01a, edu_01b)
edu_01$county
edu_01$state
state plotting method:
# calculates the mean of the var_name (population) by regional division and year
# filters out all rows were division == "ERROR"
plot.state <- function(df, var_name = "population") {
new_df <- df %>%
group_by(division, yr) %>%
summarise(ENRmean = mean(get(var_name))) %>%
filter(division != "ERROR")
# plots a line graph of the mean var_name (population) by year for each regional division
ggplot(new_df, aes(x = yr, y = ENRmean, color = division)) +
geom_line() + scale_y_continuous(labels = scales::comma) + ylab(var_name)
}
county plotting method
# order defaults to "top", so if order argument is left blank or if "top" is entered, the function will return dataframe in descending order. If "bottom" is entered for the argument, the dataframe will return in ascending order. If anything other than "top" or "bottom" is entered, the function will STOP and request user to enter appropriate argument
plot.county <- function(df, var_name = "population", st = "IL", order = "top", n = 5) {
if(order == "top"){
new_df <- df %>%
filter(state == st) %>%
group_by(area_name) %>%
summarise(ENRmean = mean(get(var_name))) %>%
arrange(desc(ENRmean))
}
else if(order == "bottom"){
new_df <- df %>%
filter(state == st) %>%
group_by(area_name) %>%
summarise(ENRmean = mean(get(var_name))) %>%
arrange(ENRmean)
}
else{
stop("Specify Order (top or bottom)")
}
# the above function returns the highest or lowest means of var_name (population) grouped by county for the state entered in the argument.
# the below function filters the original input dataframe based on the area_names of the above results, so that the new dataframe (filter_df) contains non-aggregated data for the counties identified in the above results
new_df <- new_df[1:n, ]
filter_df <- df %>%
filter(area_name %in% new_df$area_name)
# plots a line graph of the var_name (population) for each year for the top or bottom 'n' counties for the state entered in the function argument
ggplot(filter_df, aes(x = yr, y = get(var_name), color = area_name)) +
geom_line() + scale_y_continuous(labels = scales::comma) + ylab(var_name)
}
Run the data processing function on the two EDU URLs
edu_01a <- process_data("https://www4.stat.ncsu.edu/~online/datasets/EDU01a.csv", var_name = "pop")
edu_01b <- process_data("https://www4.stat.ncsu.edu/~online/datasets/EDU01b.csv", var_name = "pop")
Combine function
edu_01 <- combine_data(edu_01a, edu_01b)
Plot state dataframe
plot(edu_01$state, var_name = "pop")
Plot county dataframe - North Carolina
plot(edu_01$county, "pop", "NC", "top", 10)
Plot county dataframe - Arizona
plot(edu_01$county, "pop", "AZ", "bottom", 6)
Plot county dataframe - defaults
# Have to specify var_name since the plotting default ("population") would not match data source ("pop")
plot(edu_01$county, "pop")
Plot county dataframe - Ohio
plot(edu_01$county, "pop", "OH", "top", 8)
Run the data processing function on the four PST URLs
pst_01a <- process_data("https://www4.stat.ncsu.edu/~online/datasets/PST01a.csv")
pst_01b <- process_data("https://www4.stat.ncsu.edu/~online/datasets/PST01b.csv")
pst_01c <- process_data("https://www4.stat.ncsu.edu/~online/datasets/PST01c.csv")
pst_01d <- process_data("https://www4.stat.ncsu.edu/~online/datasets/PST01a.csv")
Combine function
pst_01 <- combine_data(pst_01a, pst_01b)
pst_01 <- combine_data(pst_01c, pst_01)
pst_01 <- combine_data(pst_01d, pst_01)
Plot state dataframe
plot(pst_01$state)
Plot county dataframe - Pennsylvania
plot(pst_01$county,,"PA",, 5)
Plot county dataframe - Texas
plot(pst_01$county,,"TX","bottom", 12)
Plot county dataframe - default
plot(pst_01$county)
Plot county dataframe - New York
plot(pst_01$county,,"NY",, 6)